******************************************************************************************
* Do-file name:	cr_data_siab_02.do 
* Task:         creates sample for correlation analysis
* Last change:  17.10.2023 
* Notes:		Based on SIAB_R7519 data
******************************************************************************************



******************************************************************************************
*** program setup
******************************************************************************************

* interpret all commands below under Stata 17.0 rules
version 17.0

* start from an empty session
clear all
macro drop _all
discard

* output / log settings
set more off
set linesize 90
* set trace on

* fix the random-number seed
set seed 123456789



******************************************************************************************
*** load data and restrict sample
******************************************************************************************

*** read the cleaned input file
use "data/clean_work.dta", clear

*** abort unless vsnr_ano and year jointly identify every observation
isid vsnr_ano year

*** sample restriction: discard everything that is not a normal worker (pers_gr == 1)
drop if pers_gr != 1


******************************************************************************************
*** fill in missings for nationality and make it time constant (only past values used)
******************************************************************************************

*** carry the first non-missing value of native forward within person (sorted by year):
***   - later missings are filled in
***   - later non-missing values are overwritten with the earlier value
***   - years before the first non-missing value stay missing (only past values used)
*** note: !missing() is used instead of "!= ." so that extended missing values (.a-.z)
***       are never treated as valid values and propagated forward
gen native2 = native
bys vsnr_ano (year): replace native2 = native2[_n-1]  if !missing(native2[_n-1])
label variable native2 "native (imputed, first non-missing value used, time constant)"


******************************************************************************************
*** clean data 
******************************************************************************************

*** variable labels for the person identifier and the gender dummy
label var vsnr_ano "Person ID"
label var female  "1 if female, 0 if male"

*** give the age variable an English name (alter -> age)
rename (alter) (age)

*** remove variables that are not needed any further
drop bnn quelle


******************************************************************************************
*** generate new variables 
******************************************************************************************

*** create log wage (only for full-time employed workers)
*** ln_wage is missing whenever teilzeit != 0 and wherever ln() is undefined
*** (tag_entg missing or <= 0)
gen ln_wage = ln(tag_entg)  if teilzeit == 0
*** the label describes the new variable ln_wage; the source variable tag_entg
*** keeps whatever label it carries from the input data
label variable ln_wage "log average daily wage in €; only full-time empl. worker"

*** employment status flag: 1 whenever pers_gr is not system missing, missing otherwise
gen status = cond(pers_gr != ., 1, .)
label variable status "1=employed"

*** employment indicator derived from status: 1 if status == 1, missing otherwise
gen emp = cond(status == 1, 1, .)
label variable emp "1=employed"

*** full-time equivalent weights in one pass:
***   teilzeit == 0 -> 1
***   teilzeit == 1 -> 0.5   (original note: working hours/week < 18)
***   anything else -> missing
gen weight_fte = cond(teilzeit == 0, 1, cond(teilzeit == 1, 0.5, .))
label variable weight_fte "full-time equivalent weights: 1 for full-time, 1/2 for part-time"

*** three-category education variable derived from ausbildung_imp
***   codes 1, 3 -> 1;  codes 2, 4 -> 2;  codes 5, 6 -> 3;  anything else -> missing
gen edu_3 = cond(inlist(ausbildung_imp, 1, 3), 1,     ///
            cond(inlist(ausbildung_imp, 2, 4), 2,     ///
            cond(inlist(ausbildung_imp, 5, 6), 3, .)))

*** attach value labels and a variable label
label define edu_3 1 "[1] None or only a school degree" 2 "[2] School & vocational" 3 "[3] Technical college/university"
label values edu_3 edu_3
label variable edu_3 "education, 3 groups (based on ausbildung_imp)"

*** two-category education variable derived from ausbildung_imp
***   codes 1, 3 -> 1;  codes 2, 4, 5, 6 -> 2;  anything else -> missing
gen edu_2 = cond(inlist(ausbildung_imp, 1, 3), 1,            ///
            cond(inlist(ausbildung_imp, 2, 4, 5, 6), 2, .))
label variable edu_2 "education, 1=low educ, 2=high educ (based on ausbildung_imp)"

*** create indicator for east germany (berlin classified as west germany)
***   ao_kreis >  11000 -> 1
***   ao_kreis <= 11000 -> 0
***   ao_kreis missing  -> missing
*** note: !missing() is used instead of "!= ." because extended missing values (.a-.z)
***       compare greater than every number and are not equal to ".", so they would
***       otherwise be coded as east (1)
gen ost = (ao_kreis > 11000)  if !missing(ao_kreis)
label variable ost "1 if district belongs to east germany (berlin west ger., based on ao_kreis)"


******************************************************************************************
*** restrict sample
******************************************************************************************

*** drop obs with important missing values
*** missing() is used instead of "== ." so that extended missing values (.a-.z)
*** are dropped as well; one command per variable keeps the per-variable counts in the log
drop if missing(ao_kreis)
drop if missing(native2)
drop if missing(teilzeit)


******************************************************************************************
*** save the new dataset
******************************************************************************************

*** finalise: minimal storage types and person-year row order
compress
sort vsnr_ano year

*** dataset-level metadata: remove any existing dataset notes, then attach the one below
label data "restricted estimation sample (1975-2019)"
notes drop _dta
notes _dta: vsnr is classified as employed if employed at 30/06 in respective year

*** write the result to disk, overwriting an existing file
save "data/work.dta", replace


******************************************************************************************
*** end
******************************************************************************************

* stop executing the do-file here; the free-text notes after this line are not run
exit


*========================================================================================*
Comments:
- unique identifier: vsnr_ano year
- wages are not imputed
